In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "notebook"
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import metrics
In [3]:
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

Plan: train several classifiers, evaluate them with relevant metrics (accuracy, precision, recall, F1, AUC-ROC), and compare and interpret the models. Hyperparameter tuning and (repeated/nested) cross-validation are planned as follow-up steps.

Loading data¶

In [4]:
# Load the Telco customer churn dataset from the local ./dataset folder.
df = pd.read_csv('./dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Set the plotly template once so every figure below inherits it
pio.templates.default = "plotly_white"

Data description¶

In [5]:
# Quick look at the first rows to verify the load and the raw column values.
df.head()
Out[5]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [6]:
# Column dtypes / non-null counts, then the overall (rows, columns) shape.
df.info()
df.shape
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
Out[6]:
(7043, 21)
In [7]:
# dtypes overview — note TotalCharges is read as object and is converted to numeric below.
df.dtypes
Out[7]:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
In [8]:
# List all column names.
df.columns.values
Out[8]:
array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn'], dtype=object)
In [9]:
# customerID is a per-row identifier with no predictive value — drop it.
df = df.drop(['customerID'], axis=1)
df.head()
Out[9]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No
2 Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 Male 0 No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

Missing value¶

In [10]:
# Coerce TotalCharges to numeric; unparsable entries become NaN (11 rows, shown below).
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors="coerce")
df.isna().sum()
Out[10]:
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64
In [11]:
# Missingness pattern: correlate the boolean indicators (1 = missing, 0 = present).
missingness = df.isna()
sns.heatmap(missingness.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation of Missingness')
plt.show()
No description has been provided for this image
In [12]:
# Inspect the rows with missing TotalCharges. Use the pandas-native isna()
# instead of np.isnan — it is the idiomatic, dtype-safe check on a Series.
df[df['TotalCharges'].isna()]
Out[12]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
488 Female 0 Yes Yes 0 No No phone service DSL Yes No Yes Yes Yes No Two year Yes Bank transfer (automatic) 52.55 NaN No
753 Male 0 No Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 20.25 NaN No
936 Female 0 Yes Yes 0 Yes No DSL Yes Yes Yes No Yes Yes Two year No Mailed check 80.85 NaN No
1082 Male 0 Yes Yes 0 Yes Yes No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 25.75 NaN No
1340 Female 0 Yes Yes 0 No No phone service DSL Yes Yes Yes Yes Yes No Two year No Credit card (automatic) 56.05 NaN No
3331 Male 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 19.85 NaN No
3826 Male 0 Yes Yes 0 Yes Yes No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 25.35 NaN No
4380 Female 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 20.00 NaN No
5218 Male 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service One year Yes Mailed check 19.70 NaN No
6670 Female 0 Yes Yes 0 Yes Yes DSL No Yes Yes Yes Yes No Two year No Mailed check 73.35 NaN No
6754 Male 0 No Yes 0 Yes Yes DSL Yes Yes No Yes No No Two year Yes Bank transfer (automatic) 61.90 NaN No

TotalCharges is missing exactly for the rows where tenure is 0 — brand-new customers who have not been billed yet.

In [13]:
# tenure == 0 marks customers who have not been billed yet; these are exactly
# the rows with missing TotalCharges (see above). Keep only billed customers.
# Boolean-mask assignment (instead of inplace drop) keeps the cell idempotent.
df = df[df['tenure'] != 0].copy()

Impute any remaining missing TotalCharges values with the column mean (after dropping the tenure == 0 rows, none should remain — this is a safeguard).

In [14]:
# Impute ONLY the TotalCharges column. The original frame-wide
# df.fillna(df['TotalCharges'].mean()) would fill NaNs in *every* column
# with the TotalCharges mean — wrong values for any other column.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())
In [15]:
# Confirm no missing values remain after the drop/impute steps.
df.isna().sum()
Out[15]:
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
In [16]:
df["SeniorCitizen"] = df["SeniorCitizen"].map({0: "No", 1: "Yes"})
In [17]:
df["InternetService"].describe()
Out[17]:
count            7032
unique              3
top       Fiber optic
freq             3096
Name: InternetService, dtype: object
In [18]:
# Descriptive statistics for the three numeric features.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_cols].describe()
Out[18]:
tenure MonthlyCharges TotalCharges
count 7032.000000 7032.000000 7032.000000
mean 32.421786 64.798208 2283.300441
std 24.545260 30.085974 2266.771362
min 1.000000 18.250000 18.800000
25% 9.000000 35.587500 401.450000
50% 29.000000 70.350000 1397.475000
75% 55.000000 89.862500 3794.737500
max 72.000000 118.750000 8684.800000

Visualization¶

Gender and churn distribution¶

In [19]:
# Donut charts of the gender split and the churn split, side by side.
gender_pie = px.pie(df, names='gender', title='Gender')
churn_pie = px.pie(df, names='Churn', title='Churn')

# Two 'domain' slots are required to host pie traces in a subplot grid.
fig = make_subplots(rows=1, cols=2,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(gender_pie.data[0], row=1, col=1)
fig.add_trace(churn_pie.data[0], row=1, col=2)

# Donut styling applied to both traces at once.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name", textfont_size=16)

# Labels placed in the donut holes.
center_labels = [
    dict(text='Gender', x=0.18, y=0.5, font_size=20, showarrow=False),
    dict(text='Churn', x=0.82, y=0.5, font_size=20, showarrow=False),
]
fig.update_layout(
    title_text="Gender and Churn Distributions",
    title_x=0.5,
    annotations=center_labels,
    font=dict(size=14),
)

fig.show()

Churn rate by gender¶

In [20]:
df[df["Churn"] == "No"]["gender"].value_counts()
Out[20]:
gender
Male      2619
Female    2544
Name: count, dtype: int64
In [21]:
# Create a crosstab
churn_gender = pd.crosstab(df['Churn'], df['gender'], normalize='index') * 100

# Plot
ax = churn_gender.plot(kind='bar', stacked=True, color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
ax.set_ylabel('Percentage (%)')
ax.set_title('Churn Rate by Gender')
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f%%', label_type='center')
plt.legend(title='Gender')
plt.show()
No description has been provided for this image
In [22]:
# Absolute customer counts for each churn outcome, split by gender.
gender_counts = pd.crosstab(df['Churn'], df['gender'])

ax = gender_counts.plot(kind='bar', color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
ax.set_title('Number of Customers by Churn and Gender')
ax.set_ylabel('Count')
for bars in ax.containers:
    ax.bar_label(bars)
plt.legend(title='Gender')
plt.show()
No description has been provided for this image

Contract distribution¶

In [23]:
# Contract-type counts per churn outcome.
# Fixed the malformed closing bold tag in the title ("<b>" -> "</b>").
fig = px.histogram(df, x="Churn", color="Contract", barmode="group",
                   title="<b>Customer contract distribution</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

Payment Method Distribution¶

In [24]:
# Donut chart of payment-method usage.
# BUG FIX: the original paired labels from unique() with counts from
# value_counts(). Those two orderings are independent, so slices could be
# labeled with the wrong counts. Take both from the same value_counts() Series.
payment_counts = df['PaymentMethod'].value_counts()

fig = go.Figure(data=[go.Pie(labels=payment_counts.index,
                             values=payment_counts.values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()

Payment for Churn¶

In [25]:
# Payment-method usage broken down by churn outcome.
fig = px.histogram(
    df,
    x="Churn",
    color="PaymentMethod",
    title="Customer Payment Method distribution w.r.t. Churn",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

Internet Service and Gender of Churn¶

In [26]:
df["InternetService"].unique()
Out[26]:
array(['DSL', 'Fiber optic', 'No'], dtype=object)
In [27]:
df[df["gender"] == "Male"][["InternetService", "Churn"]].value_counts()
Out[27]:
InternetService  Churn
DSL              No       992
Fiber optic      No       910
No               No       717
Fiber optic      Yes      633
DSL              Yes      240
No               Yes       57
Name: count, dtype: int64
In [28]:
# Churn counts by internet service and gender.
# GENERALIZED: the original hard-coded every count (magic numbers copied from
# earlier output). Derive the counts from df so the chart stays correct if the
# cleaning steps above ever change.
fig = go.Figure()

genders = ['Female', 'Male']
churn_values = ['No', 'Yes']

# Count customers per (service, churn, gender) combination.
counts = (
    df.groupby(['InternetService', 'Churn', 'gender'])
      .size()
)

# X-axis categories: "Churn:No-Female", "Churn:No-Male", ...
x_labels = [f"Churn:{churn}-{gender}" for churn in churn_values for gender in genders]

# Map raw InternetService values to the display names used in the legend.
display_names = {'DSL': 'DSL', 'Fiber optic': 'Fiber optic', 'No': 'No Internet'}

for service, legend_name in display_names.items():
    y_values = [
        int(counts.get((service, churn, gender), 0))
        for churn in churn_values
        for gender in genders
    ]
    fig.add_trace(go.Bar(
        x=x_labels,
        y=y_values,
        name=legend_name,
        text=y_values,
        textposition='auto'
    ))

fig.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")

fig.show()

Dependents churn distribution¶

In [29]:
# Churn counts grouped by whether the customer has dependents.
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
fig = px.histogram(
    df,
    x="Churn",
    color="Dependents",
    barmode="group",
    color_discrete_map=color_map,
    text_auto=True,
    title="<b>Dependents distribution</b>",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

Partner Churn¶

In [30]:
# Churn counts grouped by partner status. Fixed "Chrun" typo in the title.
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="Partner", barmode="group",
                   title="<b>Churn distribution w.r.t. Partners</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

SeniorCitizen distribution¶

In [31]:
# Churn counts stacked by senior-citizen status. Fixed "Chrun" typo in the title.
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="SeniorCitizen",
                   title="<b>Churn distribution w.r.t. Senior Citizen</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

Online security churn¶

In [32]:
# Churn counts grouped by online-security subscription.
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
fig = px.histogram(
    df,
    x="Churn",
    color="OnlineSecurity",
    barmode="group",
    color_discrete_map=color_map,
    text_auto=True,
    title="<b>Churn w.r.t Online Security</b>",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

paperless billing¶

In [33]:
# Churn counts stacked by paperless-billing status. Fixed "Chrun" typo in the title.
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="PaperlessBilling",
                   title="<b>Churn distribution w.r.t. Paperless Billing</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

TechSupport distribution¶

In [34]:
# Churn counts grouped by tech-support subscription. Fixed "Chrun" typo in the title.
fig = px.histogram(df, x="Churn", color="TechSupport", barmode="group",
                   title="<b>Churn distribution w.r.t. TechSupport</b>", text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
In [35]:
# Churn counts stacked by phone-service status. Fixed "Chrun" typo in the title.
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="PhoneService",
                   title="<b>Churn distribution w.r.t. Phone Service</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
In [36]:
# Density of monthly charges for retained vs. churned customers.
sns.set_context("paper", font_scale=1.1)
retained_charges = df.MonthlyCharges[df["Churn"] == 'No']
churned_charges = df.MonthlyCharges[df["Churn"] == 'Yes']
ax = sns.kdeplot(retained_charges, color="Red", fill=True)
ax = sns.kdeplot(churned_charges, ax=ax, color="Blue", fill=True)
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn')
Out[36]:
Text(0.5, 1.0, 'Distribution of monthly charges by churn')
No description has been provided for this image
In [37]:
# Density of TOTAL charges for retained vs. churned customers.
# BUG FIX: the x-label and title were copy-pasted from the monthly-charges
# cell and wrongly said "monthly charges".
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'No')],
                 color="Red", fill=True)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'Yes')],
                 ax=ax, color="Blue", fill=True)
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn')
Out[37]:
Text(0.5, 1.0, 'Distribution of monthly charges by churn')
No description has been provided for this image
In [38]:
# Box plot of tenure for each churn outcome.
fig = px.box(df, x='Churn', y='tenure')

# px.box builds a single-axes figure, so the row/col arguments the original
# passed to update_yaxes/update_xaxes were meaningless — dropped.
fig.update_yaxes(title_text='Tenure (Months)')
fig.update_xaxes(title_text='Churn')

# Update size and title
fig.update_layout(autosize=True, width=750, height=600,
                  title_font=dict(size=25, family='Courier'),
                  title='Tenure vs Churn',
                  )

fig.show()
In [39]:
plt.figure(figsize=(25, 10))

# BUG FIX: pd.factorize assigns arbitrary integer codes in order of first
# appearance, which destroys the ordering of numeric columns (tenure,
# MonthlyCharges, TotalCharges) and makes their correlations meaningless.
# Factorize only the object (categorical) columns; keep numeric columns as-is.
encoded = df.copy()
for col in encoded.select_dtypes(include='object').columns:
    encoded[col] = pd.factorize(encoded[col])[0]
corr = encoded.corr()

# Mask the upper triangle — the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

ax = sns.heatmap(corr, mask=mask, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, linewidths=.2,
                 cmap='coolwarm', vmin=-1, vmax=1)
No description has been provided for this image

Data preprocessing¶

In [40]:
# Cardinality of each column — guides binary vs. one-hot encoding below.
df.nunique()
Out[40]:
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
dtype: int64
In [41]:
# Show the value sets of the low-cardinality (categorical) columns.
for col in df.columns:
    if df[col].nunique(dropna=True) < 5:
        values = list(df[col].dropna().unique())
        print(f"{col}: {values}")
gender: ['Female', 'Male']
SeniorCitizen: ['No', 'Yes']
Partner: ['Yes', 'No']
Dependents: ['No', 'Yes']
PhoneService: ['No', 'Yes']
MultipleLines: ['No phone service', 'No', 'Yes']
InternetService: ['DSL', 'Fiber optic', 'No']
OnlineSecurity: ['No', 'Yes', 'No internet service']
OnlineBackup: ['Yes', 'No', 'No internet service']
DeviceProtection: ['No', 'Yes', 'No internet service']
TechSupport: ['No', 'Yes', 'No internet service']
StreamingTV: ['No', 'Yes', 'No internet service']
StreamingMovies: ['No', 'Yes', 'No internet service']
Contract: ['Month-to-month', 'One year', 'Two year']
PaperlessBilling: ['Yes', 'No']
PaymentMethod: ['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)']
Churn: ['No', 'Yes']

Encoding¶

In [42]:
# Yes/No columns -> 0/1; gender -> 0 (Female) / 1 (Male);
# multi-valued categoricals -> one-hot dummies (all levels kept).
yes_no = {'No': 0, 'Yes': 1}

binary_cols = ['Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling', 'Churn', 'SeniorCitizen']
for col in binary_cols:
    df[col] = df[col].map(yes_no)

df['gender'] = df['gender'].map({'Female': 0, 'Male': 1})

categorical_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'
]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=False)
In [43]:
# Correlation of every feature with the (now numeric) Churn target, sorted.
churn_corr = df.corr()['Churn'].drop('Churn').sort_values(ascending=False)

plt.figure(figsize=(16, 9))

# Diverging palette: strongly positive -> red, strongly negative -> blue.
# (churn_corr + 1) / 2 rescales [-1, 1] to [0, 1] for the colormap.
bar_colors = plt.cm.RdYlBu_r((churn_corr + 1) / 2)
bars = plt.bar(churn_corr.index, churn_corr.values, color=bar_colors,
               edgecolor='black', linewidth=0.5)

plt.xticks(rotation=60, ha='right', fontsize=11)
plt.yticks(fontsize=11)
plt.ylabel('Correlation with Churn', fontsize=13)
plt.title('Feature Correlation with Churn (Higher = More Likely to Churn)',
          fontsize=16, weight='bold')
plt.axhline(0, color='gray', linewidth=0.8)

# Annotate only the bars with a meaningful correlation (|r| > 0.1).
for bar, r in zip(bars, churn_corr.values):
    if abs(r) <= 0.1:
        continue
    height = bar.get_height()
    offset = 0.01 if height >= 0 else -0.02
    alignment = 'bottom' if height >= 0 else 'top'
    plt.text(bar.get_x() + bar.get_width() / 2, height + offset,
             f'{r:.2f}', ha='center', va=alignment,
             fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()
No description has been provided for this image

Split train/test set¶

In [44]:
# Separate the target (Churn) from the feature matrix.
y = df['Churn'].values
X = df.drop(columns=['Churn'])
In [45]:
# Stratified 70/30 split keeps the churn ratio equal in train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40, stratify=y)
In [46]:
def distplot(feature, frame, color='r'):
    """Draw a histogram of ``frame[feature]`` in its own small figure."""
    plt.figure(figsize=(8, 3))
    plt.title(f"Distribution for {feature}")
    sns.histplot(frame[feature], color=color)
In [47]:
# Histograms of the raw (unscaled) numeric features.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols:
    distplot(feat, df)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [48]:
# Same three features after standardization (zero mean, unit variance).
# CONSISTENCY FIX: the loop iterated over `numerical_cols`, a variable from a
# much earlier cell that only works because it happens to hold the same list
# as `num_cols`. Use `num_cols`, which this section actually defines.
df_std = pd.DataFrame(StandardScaler().fit_transform(df[num_cols].astype('float64')), columns=num_cols)
for feat in num_cols:
    distplot(feat, df_std, color='c')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [49]:
# Fit the scaler on the training split only, then apply it to both splits,
# so no test-set statistics leak into training.
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

Model¶

In [50]:
def predict_result(model, model_name, X_test, y_test):
    """Report test-set accuracy, AUC, and a classification report for the
    best estimator found by a fitted search object (e.g. GridSearchCV).

    Returns (best_estimator, predicted_labels).
    """
    best = model.best_estimator_

    # Hard labels and positive-class probabilities on the test set.
    predicted_y = best.predict(X_test)
    positive_probs = best.predict_proba(X_test)[:, 1]

    test_accuracy = best.score(X_test, y_test)
    test_auc = roc_auc_score(y_test, positive_probs)

    print(f"\nFinal Test Performance {model_name}:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"AUC: {test_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predicted_y))
    return best, predicted_y


def show_confusion_matrix(test, predicted_y, model_name):
    """Render the confusion matrix of true vs. predicted labels as a heatmap."""
    cm = confusion_matrix(test, predicted_y)

    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt="d", linecolor="k", linewidths=3, cmap="Blues")
    plt.title(f"{model_name.upper()} CONFUSION MATRIX", fontsize=14)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()


def show_ROC(model, y_test, model_name, X=None):
    """Plot the ROC curve of a fitted classifier against the random baseline.

    Parameters
    ----------
    model : fitted estimator exposing predict_proba
    y_test : true binary labels for the evaluation set
    model_name : label used in the legend and title
    X : optional feature matrix. Defaults to the notebook-global X_test for
        backward compatibility with the existing 3-argument calls; passing it
        explicitly removes the hidden dependency on that global.
    """
    if X is None:
        X = X_test  # NOTE(review): original silently relied on this notebook global
    y_pred_prob = model.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

    plt.figure(figsize=(6, 5))
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

    # use roc_auc_score for the legend annotation
    auc_test = roc_auc_score(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_test:.3f})', color="r", linewidth=2)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name.upper()} ROC CURVE', fontsize=16)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

def importance_feature(best, model_name, feature_names=None):
    """Print and plot the top-10 feature importances of a fitted tree model.

    Parameters
    ----------
    best : fitted estimator exposing feature_importances_
    model_name : label used in the plot title
    feature_names : optional list of feature names. Defaults to the
        notebook-global X_train's columns for backward compatibility;
        pass it explicitly to remove the hidden dependency on that global.
    """
    if feature_names is None:
        feature_names = X_train.columns.tolist()  # NOTE(review): notebook global
    importances = best.feature_importances_
    importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values(by='importance', ascending=False)

    print(importance.head(10))

    plt.figure(figsize=(10, 8))
    sns.barplot(data=importance.head(10), x='importance', y='feature')
    plt.title(f'{model_name} Feature Importances')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

Knn¶

In [51]:
# Baseline KNN classifier with k = 11 neighbours.
knn_model = KNeighborsClassifier(n_neighbors=11)
knn_model.fit(X_train, y_train)

predict_knn_y = knn_model.predict(X_test)
accuracy_knn = knn_model.score(X_test, y_test)

print("KNN accuracy:", accuracy_knn)
print(classification_report(y_test, predict_knn_y))
KNN accuracy: 0.7767772511848341
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1549
           1       0.58      0.55      0.57       561

    accuracy                           0.78      2110
   macro avg       0.71      0.71      0.71      2110
weighted avg       0.77      0.78      0.77      2110

In [52]:
# Confusion matrix and ROC curve for the fitted KNN model.
show_confusion_matrix(y_test, predict_knn_y, "KNN")
show_ROC(knn_model, y_test, "KNN")
No description has been provided for this image
No description has been provided for this image

Svm¶

In [ ]:
# SVM classifier; probability=True enables predict_proba for ROC/AUC later.
svc_model = SVC(probability=True, random_state=1)
svc_model.fit(X_train, y_train)

# Hard-label predictions on the held-out set
predict_svc_y = svc_model.predict(X_test)

# Test-set accuracy plus the per-class report
accuracy_svc = svc_model.score(X_test, y_test)
print("SVM accuracy is:", accuracy_svc)
print(classification_report(y_test, predict_svc_y))
In [122]:
# Confusion matrix and ROC curve for the fitted SVM model.
# NOTE(review): the SVM training cell above shows "In [ ]" (never executed in
# this saved run) — re-run the notebook top-to-bottom so these names exist.
show_confusion_matrix(y_test, predict_svc_y, "svm")
show_ROC(svc_model, y_test, "svm")
No description has been provided for this image
No description has been provided for this image

Random forest¶

In [123]:
# Random forest: 500 trees, sqrt feature sampling, leaves capped at 30,
# out-of-bag scoring enabled, all cores used.
rt_model = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1,
                                  random_state=50, max_features="sqrt",
                                  max_leaf_nodes=30)
rt_model.fit(X_train, y_train)

# Hard-label predictions on the held-out set
prediction_rf_y = rt_model.predict(X_test)

# Test-set accuracy plus the per-class report
print(metrics.accuracy_score(y_test, prediction_rf_y))
print(classification_report(y_test, prediction_rf_y))
0.8085308056872038
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1549
           1       0.69      0.50      0.58       561

    accuracy                           0.81      2110
   macro avg       0.76      0.71      0.73      2110
weighted avg       0.80      0.81      0.80      2110

In [124]:
# Confusion matrix and ROC curve for the fitted random forest model.
show_confusion_matrix(y_test, prediction_rf_y, "random forest")
show_ROC(rt_model, y_test, "random forest")
No description has been provided for this image
No description has been provided for this image

logistic regression¶

In [125]:
# Logistic regression; max_iter raised to 1000 so the solver converges.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)

prediction_lr_y = lr_model.predict(X_test)
accuracy_lr = lr_model.score(X_test, y_test)

print("Logistic Regression accuracy is :", accuracy_lr)
report = classification_report(y_test, prediction_lr_y)
print(report)
Logistic Regression accuracy is : 0.8113744075829384
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1549
           1       0.67      0.58      0.62       561

    accuracy                           0.81      2110
   macro avg       0.76      0.74      0.75      2110
weighted avg       0.80      0.81      0.81      2110

In [126]:
# Confusion matrix and ROC curve for the fitted logistic regression model.
show_confusion_matrix(y_test, prediction_lr_y, "logistic regression")
show_ROC(lr_model, y_test, "logistic regression")
No description has been provided for this image
No description has been provided for this image

Decision tree¶

In [127]:
# Single decision tree (unpruned defaults) as a simple baseline.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

prediction_dt_y = dt_model.predict(X_test)
accuracy_dt = dt_model.score(X_test, y_test)

print("Decision Tree accuracy is:", accuracy_dt)
print(classification_report(y_test, prediction_dt_y))
Decision Tree accuracy is: 0.7317535545023697
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      1549
           1       0.50      0.55      0.52       561

    accuracy                           0.73      2110
   macro avg       0.66      0.68      0.67      2110
weighted avg       0.74      0.73      0.74      2110

In [128]:
# Confusion matrix and ROC curve for the fitted decision tree model.
show_confusion_matrix(y_test, prediction_dt_y, "decision tree")
show_ROC(dt_model, y_test, "decision tree")
No description has been provided for this image
No description has been provided for this image

Compare models¶

In [129]:
# Collect the fitted models for side-by-side evaluation below.
models = {
    'Logistic Regression': lr_model,
    'KNN': knn_model,
    'Random Forest': rt_model,
    'Decision Tree': dt_model,
    'SVM': svc_model
}
  • Best Overall Performance: Logistic Regression achieves the highest Accuracy (0.8114) and F1-Score (0.6188), overall balance for this dataset.
  • Best Discriminative Power: Random Forest has the highest AUC-ROC (0.8589), even though its accuracy is slightly lower than Logistic Regression's and its recall is the lowest of the group.
  • Best Precision: Random Forest also has the highest Precision (0.6938), nearly 70%.
  • Best Recall: KNN has the highest Recall (0.5526), followed closely by Decision Tree (0.5544). These models are better at finding churners, but at the cost of low precision.
  • Worst Performer: Decision Tree performs significantly worse than the others in almost every metric, especially AUC-ROC (0.6753). It may be overfitting to the training data.
In [130]:
# Evaluate every fitted model on the held-out test set with a common metric set.
results = []

for name, model in models.items():
    # Hard labels and positive-class probabilities for this model
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc_score(y_test, y_pred_proba),
    })

# One row per model, rounded for readability
results_df = pd.DataFrame(results).round(4)
print(results_df)
                 Model  Accuracy  Precision  Recall  F1-Score  AUC-ROC
0  Logistic Regression    0.8114     0.6687  0.5758    0.6188   0.8578
1                  KNN    0.7768     0.5849  0.5526    0.5683   0.8183
2        Random Forest    0.8085     0.6938  0.5009    0.5818   0.8589
3        Decision Tree    0.7318     0.4960  0.5544    0.5236   0.6753
4                  SVM    0.8090     0.6846  0.5223    0.5925   0.8059
  • The purple bars (AUC-ROC) show Random Forest leading, followed by Logistic Regression and SVM.
  • The blue bars (Accuracy) show Logistic Regression, SVM, and Random Forest clustered together at the top.
  • The red bars (Recall) highlight KNN and Decision Tree as having relatively higher recall compared to their precision.
In [131]:
# Grouped bar chart comparing the models across the headline metrics.
# Removed the redundant `import plotly.express as px` — px is already imported
# in the notebook's top import cell; re-importing mid-notebook hides dependencies.

# Melt for easy plotting: one (Model, Metric, Score) row per bar.
melted = results_df.melt(id_vars='Model',
                         value_vars=['Accuracy', 'Recall', 'F1-Score', 'AUC-ROC'],
                         var_name='Metric', value_name='Score')

fig = px.bar(melted, x='Model', y='Score', color='Metric', barmode='group',
             title='Model Performance Comparison',
             height=500, text_auto=True)
fig.update_layout(yaxis_range=[0, 1])
fig.show()
  • All models perform significantly better than the random classifier.
  • The Random Forest curve is consistently above the others, especially in the top-left region.
  • The Logistic Regression curve is very close to Random Forest, similar performance.
  • The Decision Tree curve is lower, especially in the middle range, poor AUC score.
In [132]:
# Overlay the ROC curves of all models against the random-classifier diagonal.
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

for name, model in models.items():
    probs = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probs)
    model_auc = roc_auc_score(y_test, probs)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {model_auc:.3f})')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(alpha=0.3)
plt.show()
No description has been provided for this image
  • For a Balanced Approach: Logistic Regression offers the best overall accuracy and F1-score, and its coefficients can be easily interpreted to understand which features drive churn.
  • For Maximizing Detection (High Recall): KNN or Decision Tree, though be aware they have lower precision.
  • For Minimizing False Positives (High Precision): Random Forest.
  • For Best Overall Ranking Ability (Highest AUC): Random Forest.